/***************************************************************************************************
 * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 *modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice,
 *this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *notice, this list of conditions and the following disclaimer in the
 *documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its
 *contributors may be used to endorse or promote products derived from this
 *software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 *DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT,
 *INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 *OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 *NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 *EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Unit tests for functional operators.
*/

#include "../common/cutlass_unit_test.h"

#include "cutlass/functional.h"

#include "cutlass/layout/matrix.h"
#include "cutlass/util/host_tensor.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace test {
namespace core {
namespace kernel {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Applies a one-argument functor to a single value.
///
/// Reads one Element through `a`, passes it to a default-constructed Operator
/// and stores the result through `d`. The kernel performs no thread indexing,
/// so every launched thread does the same load and the same store.
template <typename Element, typename Operator>
__global__ void unary_operator(Element* d, Element const* a) {
    Element const input = *a;

    Operator functor;

    *d = functor(input);
}

/// Applies a two-argument functor, feeding the result back as the second
/// operand.
///
/// Loads `*a` and `*b` once, then evaluates `b = op(a, b)` `Iterations` times
/// and stores the final value through `d`. With the default of one iteration
/// this is simply `*d = op(*a, *b)`; with zero or fewer iterations `*b` is
/// written to `*d` unchanged. The kernel performs no thread indexing, so every
/// launched thread does the same work.
template <typename Element, typename Operator>
__global__ void binary_operator(Element* d, Element const* a, Element const* b,
                                int Iterations = 1) {
    Operator functor;

    Element const lhs = *a;
    Element running = *b;

    // The trip count is a runtime value; keep the loop rolled.
    CUTLASS_PRAGMA_NO_UNROLL
    for (int remaining = Iterations; remaining > 0; --remaining) {
        running = functor(lhs, running);
    }

    *d = running;
}

/// Applies a three-argument functor, feeding the result back as the third
/// operand.
///
/// Loads `*a`, `*b` and `*c` once, then evaluates `c = op(a, b, c)`
/// `Iterations` times and stores the final value through `d`. With the default
/// of one iteration this is simply `*d = op(*a, *b, *c)`; with zero or fewer
/// iterations `*c` is written to `*d` unchanged. The kernel performs no thread
/// indexing, so every launched thread does the same work.
template <typename Element, typename Operator>
__global__ void trinary_operator(Element* d, Element const* a, Element const* b,
                                 Element const* c, int Iterations = 1) {
    Operator functor;

    Element const first = *a;
    Element const second = *b;
    Element accum = *c;

    // The trip count is a runtime value; keep the loop rolled.
    CUTLASS_PRAGMA_NO_UNROLL
    for (int remaining = Iterations; remaining > 0; --remaining) {
        accum = functor(first, second, accum);
    }

    *d = accum;
}

/////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace kernel
}  // namespace core
}  // namespace test

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Verifies cutlass::plus on cutlass::Array<half_t, kN> against a host
/// reference.
///
/// A and B are filled with small non-negative integers ((2i+1)%5 and
/// (4i+8)%7), so every sum is at most 10 and the comparison against the
/// float reference can be exact. D starts at zero; the final
/// `some_d_nonzero` check guards against the output never being written.
///
/// \tparam kN number of half_t elements in the array operand.
template <int kN>
void Functional_plus_f16xN() {
    using Element = cutlass::Array<cutlass::half_t, kN>;
    using Operator = cutlass::plus<Element>;

    using Tensor =
            cutlass::HostTensor<cutlass::half_t, cutlass::layout::RowMajor>;

    Tensor D({1, kN});
    Tensor A({1, kN});
    Tensor B({1, kN});

    for (int i = 0; i < kN; ++i) {
        A.host_data()[i] = cutlass::half_t((i * 2 + 1) % 5);
        B.host_data()[i] = cutlass::half_t((i * 4 + 8) % 7);
        D.host_data()[i] = cutlass::half_t(0);
    }

    D.sync_device();
    A.sync_device();
    B.sync_device();

    // One block of one thread: the kernel treats the whole kN-element row as
    // a single Element value.
    test::core::kernel::binary_operator<Element, Operator>
            <<<dim3(1, 1), dim3(1, 1)>>>(
                    reinterpret_cast<Element*>(D.device_data()),
                    reinterpret_cast<Element const*>(A.device_data()),
                    reinterpret_cast<Element const*>(B.device_data()));

    // A kernel launch does not return a status; query it explicitly so a
    // failed launch is reported as such instead of as kN value mismatches.
    cudaError_t const launch_status = cudaGetLastError();
    ASSERT_EQ(launch_status, cudaSuccess)
            << "binary_operator launch failed: "
            << cudaGetErrorString(launch_status);

    D.sync_host();

    bool some_d_nonzero = false;

    for (int i = 0; i < kN; ++i) {
        float a = float(A.host_data()[i]);
        float b = float(B.host_data()[i]);
        float d = float(D.host_data()[i]);

        EXPECT_EQ(d, a + b) << "element " << i << ": " << a << " + " << b;

        if (d != 0) {
            some_d_nonzero = true;
        }
    }

    EXPECT_TRUE(some_d_nonzero);
}

/// plus on a 16-element (even count) half_t array.
TEST(Functional, plus_f16x16) {
    Functional_plus_f16xN<16>();
}

/// plus on a 17-element (odd count) half_t array.
/// NOTE(review): presumably chosen to cover an element count that is not a
/// multiple of two -- confirm against cutlass/functional.h.
TEST(Functional, plus_f16x17) {
    Functional_plus_f16xN<17>();
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Verifies cutlass::minus on cutlass::Array<half_t, kN> against a host
/// reference.
///
/// A and B are filled with small non-negative integers ((2i+1)%5 and
/// (4i+8)%7), so every difference is a small integer and the comparison
/// against the float reference can be exact. D starts at zero; the final
/// `some_d_nonzero` check guards against the output never being written.
///
/// \tparam kN number of half_t elements in the array operand.
template <int kN>
void Functional_minus_f16xN() {
    using Element = cutlass::Array<cutlass::half_t, kN>;
    using Operator = cutlass::minus<Element>;

    using Tensor =
            cutlass::HostTensor<cutlass::half_t, cutlass::layout::RowMajor>;

    Tensor D({1, kN});
    Tensor A({1, kN});
    Tensor B({1, kN});

    for (int i = 0; i < kN; ++i) {
        A.host_data()[i] = cutlass::half_t((i * 2 + 1) % 5);
        B.host_data()[i] = cutlass::half_t((i * 4 + 8) % 7);
        D.host_data()[i] = cutlass::half_t(0);
    }

    D.sync_device();
    A.sync_device();
    B.sync_device();

    // One block of one thread: the kernel treats the whole kN-element row as
    // a single Element value.
    test::core::kernel::binary_operator<Element, Operator>
            <<<dim3(1, 1), dim3(1, 1)>>>(
                    reinterpret_cast<Element*>(D.device_data()),
                    reinterpret_cast<Element const*>(A.device_data()),
                    reinterpret_cast<Element const*>(B.device_data()));

    // A kernel launch does not return a status; query it explicitly so a
    // failed launch is reported as such instead of as kN value mismatches.
    cudaError_t const launch_status = cudaGetLastError();
    ASSERT_EQ(launch_status, cudaSuccess)
            << "binary_operator launch failed: "
            << cudaGetErrorString(launch_status);

    D.sync_host();

    bool some_d_nonzero = false;

    for (int i = 0; i < kN; ++i) {
        float a = float(A.host_data()[i]);
        float b = float(B.host_data()[i]);
        float d = float(D.host_data()[i]);

        EXPECT_EQ(d, a - b) << "element " << i << ": " << a << " - " << b;

        if (d != 0) {
            some_d_nonzero = true;
        }
    }

    EXPECT_TRUE(some_d_nonzero);
}

/// minus on a 16-element (even count) half_t array.
TEST(Functional, minus_f16x16) {
    Functional_minus_f16xN<16>();
}

/// minus on a 17-element (odd count) half_t array.
/// NOTE(review): presumably chosen to cover an element count that is not a
/// multiple of two -- confirm against cutlass/functional.h.
TEST(Functional, minus_f16x17) {
    Functional_minus_f16xN<17>();
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Verifies cutlass::multiplies on cutlass::Array<half_t, kN> against a host
/// reference.
///
/// A and B are filled with small non-negative integers ((2i+1)%5 and
/// (4i+8)%7), so every product is at most 24 and the comparison against the
/// float reference can be exact. D starts at zero; the final
/// `some_d_nonzero` check guards against the output never being written.
///
/// \tparam kN number of half_t elements in the array operand.
template <int kN>
void Functional_multiplies_f16xN() {
    using Element = cutlass::Array<cutlass::half_t, kN>;
    using Operator = cutlass::multiplies<Element>;

    using Tensor =
            cutlass::HostTensor<cutlass::half_t, cutlass::layout::RowMajor>;

    Tensor D({1, kN});
    Tensor A({1, kN});
    Tensor B({1, kN});

    for (int i = 0; i < kN; ++i) {
        A.host_data()[i] = cutlass::half_t((i * 2 + 1) % 5);
        B.host_data()[i] = cutlass::half_t((i * 4 + 8) % 7);
        D.host_data()[i] = cutlass::half_t(0);
    }

    D.sync_device();
    A.sync_device();
    B.sync_device();

    // One block of one thread: the kernel treats the whole kN-element row as
    // a single Element value.
    test::core::kernel::binary_operator<Element, Operator>
            <<<dim3(1, 1), dim3(1, 1)>>>(
                    reinterpret_cast<Element*>(D.device_data()),
                    reinterpret_cast<Element const*>(A.device_data()),
                    reinterpret_cast<Element const*>(B.device_data()));

    // A kernel launch does not return a status; query it explicitly so a
    // failed launch is reported as such instead of as kN value mismatches.
    cudaError_t const launch_status = cudaGetLastError();
    ASSERT_EQ(launch_status, cudaSuccess)
            << "binary_operator launch failed: "
            << cudaGetErrorString(launch_status);

    D.sync_host();

    bool some_d_nonzero = false;

    for (int i = 0; i < kN; ++i) {
        float a = float(A.host_data()[i]);
        float b = float(B.host_data()[i]);
        float d = float(D.host_data()[i]);

        EXPECT_EQ(d, a * b) << "element " << i << ": " << a << " * " << b;

        if (d != 0) {
            some_d_nonzero = true;
        }
    }

    EXPECT_TRUE(some_d_nonzero);
}

/// multiplies on a 16-element (even count) half_t array.
TEST(Functional, multiplies_f16x16) {
    Functional_multiplies_f16xN<16>();
}

/// multiplies on a 17-element (odd count) half_t array.
/// NOTE(review): presumably chosen to cover an element count that is not a
/// multiple of two -- confirm against cutlass/functional.h.
TEST(Functional, multiplies_f16x17) {
    Functional_multiplies_f16xN<17>();
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Verifies cutlass::divides on cutlass::Array<half_t, kN> against a host
/// reference.
///
/// A and B are filled with small non-negative integers ((2i+1)%5 and
/// (4i+8)%7). B is zero at i = 5 and i = 12, where A is 1 and 0
/// respectively, so for kN >= 13 the reference contains both an infinity
/// (1/0) and a NaN (0/0); those are checked by class rather than by value.
/// Finite quotients are compared with an absolute tolerance. D starts at
/// zero; the final `some_d_nonzero` check guards against the output never
/// being written.
///
/// \tparam kN number of half_t elements in the array operand.
template <int kN>
void Functional_divides_f16xN() {
    using Element = cutlass::Array<cutlass::half_t, kN>;
    using Operator = cutlass::divides<Element>;

    using Tensor =
            cutlass::HostTensor<cutlass::half_t, cutlass::layout::RowMajor>;

    Tensor D({1, kN});
    Tensor A({1, kN});
    Tensor B({1, kN});

    for (int i = 0; i < kN; ++i) {
        A.host_data()[i] = cutlass::half_t((i * 2 + 1) % 5);
        B.host_data()[i] = cutlass::half_t((i * 4 + 8) % 7);
        D.host_data()[i] = cutlass::half_t(0);
    }

    D.sync_device();
    A.sync_device();
    B.sync_device();

    // One block of one thread: the kernel treats the whole kN-element row as
    // a single Element value.
    test::core::kernel::binary_operator<Element, Operator>
            <<<dim3(1, 1), dim3(1, 1)>>>(
                    reinterpret_cast<Element*>(D.device_data()),
                    reinterpret_cast<Element const*>(A.device_data()),
                    reinterpret_cast<Element const*>(B.device_data()));

    // A kernel launch does not return a status; query it explicitly so a
    // failed launch is reported as such instead of as kN value mismatches.
    cudaError_t const launch_status = cudaGetLastError();
    ASSERT_EQ(launch_status, cudaSuccess)
            << "binary_operator launch failed: "
            << cudaGetErrorString(launch_status);

    D.sync_host();

    // Absolute tolerance for finite quotients (loop-invariant).
    float const kThreshold = 0.0005f;

    bool some_d_nonzero = false;

    for (int i = 0; i < kN; ++i) {
        float a = float(A.host_data()[i]);
        float b = float(B.host_data()[i]);
        float d = float(D.host_data()[i]);

        float expected = a / b;

        if (std::isnan(expected)) {
            EXPECT_TRUE(std::isnan(d))
                    << "element " << i << ": got " << d << " for " << a
                    << " / " << b << ", expected NaN";
        } else if (std::isinf(expected)) {
            EXPECT_TRUE(std::isinf(d))
                    << "element " << i << ": got " << d << " for " << a
                    << " / " << b << ", expected infinity";
        } else {
            EXPECT_TRUE(std::abs(d - expected) < kThreshold)
                    << "Got: " << d << " = " << a << " / " << b
                    << ", expected: " << expected << " (element " << i << ")";
        }

        // NaN compares unequal to zero, so NaN/inf results also count here.
        if (d != 0) {
            some_d_nonzero = true;
        }
    }

    EXPECT_TRUE(some_d_nonzero);
}

/// divides on a 16-element (even count) half_t array.
TEST(Functional, divides_f16x16) {
    Functional_divides_f16xN<16>();
}

/// divides on a 17-element (odd count) half_t array.
/// NOTE(review): presumably chosen to cover an element count that is not a
/// multiple of two -- confirm against cutlass/functional.h.
TEST(Functional, divides_f16x17) {
    Functional_divides_f16xN<17>();
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Verifies cutlass::multiply_add on cutlass::Array<T, kN> against a host
/// reference computed as a * b + c in float.
///
/// A, B and C are filled with small non-negative integers ((2i+1)%5,
/// (4i+8)%7 and (3i+11)%11), so every result is an integer no larger than 34
/// and the comparison against the float reference is exact. D starts at zero;
/// the final `some_d_nonzero` check guards against the output never being
/// written.
///
/// \tparam T  scalar element type; must be constructible from int and
///            convertible to float (both are used below).
/// \tparam kN number of T elements in the array operand.
template <typename T, int kN>
void Functional_multiply_add_TxN() {
    using Element = cutlass::Array<T, kN>;
    using Operator = cutlass::multiply_add<Element>;

    using Tensor = cutlass::HostTensor<T, cutlass::layout::RowMajor>;

    Tensor D({1, kN});
    Tensor A({1, kN});
    Tensor B({1, kN});
    Tensor C({1, kN});

    for (int i = 0; i < kN; ++i) {
        A.host_data()[i] = T((i * 2 + 1) % 5);
        B.host_data()[i] = T((i * 4 + 8) % 7);
        C.host_data()[i] = T((i * 3 + 11) % 11);
        D.host_data()[i] = T(0);
    }

    D.sync_device();
    A.sync_device();
    B.sync_device();
    C.sync_device();

    // One block of one thread: the kernel treats the whole kN-element row as
    // a single Element value.
    test::core::kernel::trinary_operator<Element, Operator>
            <<<dim3(1, 1), dim3(1, 1)>>>(
                    reinterpret_cast<Element*>(D.device_data()),
                    reinterpret_cast<Element const*>(A.device_data()),
                    reinterpret_cast<Element const*>(B.device_data()),
                    reinterpret_cast<Element const*>(C.device_data()));

    // A kernel launch does not return a status; query it explicitly so a
    // failed launch is reported as such instead of as kN value mismatches.
    cudaError_t const launch_status = cudaGetLastError();
    ASSERT_EQ(launch_status, cudaSuccess)
            << "trinary_operator launch failed: "
            << cudaGetErrorString(launch_status);

    D.sync_host();

    bool some_d_nonzero = false;

    for (int i = 0; i < kN; ++i) {
        float a = float(A.host_data()[i]);
        float b = float(B.host_data()[i]);
        float c = float(C.host_data()[i]);
        float d = float(D.host_data()[i]);

        EXPECT_EQ(d, a * b + c)
                << "element " << i << ": " << a << " * " << b << " + " << c;

        if (d != 0) {
            some_d_nonzero = true;
        }
    }

    EXPECT_TRUE(some_d_nonzero);
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// multiply_add on a 16-element (even count) half_t array.
TEST(Functional, multiply_add_f16x16) {
    Functional_multiply_add_TxN<cutlass::half_t, 16>();
}

/// multiply_add on a 17-element (odd count) half_t array.
/// NOTE(review): presumably chosen to cover an element count that is not a
/// multiple of two -- confirm against cutlass/functional.h.
TEST(Functional, multiply_add_f16x17) {
    Functional_multiply_add_TxN<cutlass::half_t, 17>();
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// multiply_add on a 16-element (even count) bfloat16_t array.
TEST(Functional, multiply_add_bf16x16) {
    Functional_multiply_add_TxN<cutlass::bfloat16_t, 16>();
}

/// multiply_add on a 17-element (odd count) bfloat16_t array.
/// NOTE(review): presumably chosen to cover an element count that is not a
/// multiple of two -- confirm against cutlass/functional.h.
TEST(Functional, multiply_add_bf16x17) {
    Functional_multiply_add_TxN<cutlass::bfloat16_t, 17>();
}

/////////////////////////////////////////////////////////////////////////////////////////////////
